Workshop Day 2B | 2022-07-26
Jeffrey M. Girard | Pitt Methods
A data visualization expresses data through visual aesthetics.
[2B] Visualize I
Some simple graphics are easy to describe and may even have ready names.
[2B] Visualize I
A grammar of graphics will help us describe more complex graphics.
[2B] Visualize I
[2B] Visualize I
Graphics require data (e.g., tibbles), which describe observations using variables.
[2B] Visualize I
Graphics require aesthetic mappings, which connect data variables to visual qualities.
[2B] Visualize I
Graphics require scales, which connect specific data values to specific aesthetic values.
[2B] Visualize I
Graphics require geometric objects (geoms), which represent the observations.
[2B] Visualize I
+ rather than |>
[2B] Visualize I
# SETUP: Load the tidyverse and preview the example dataset
library(tidyverse)
mpg
# ==============================================================================
# LESSON: Start a plot by handing ggplot() a tibble as its data
p <- ggplot(data = mpg)
p
# ==============================================================================
# LESSON: Then describe the aesthetic mappings with aes()
p <- ggplot(
  data = mpg,
  mapping = aes(x = displ, y = hwy)
)
p
# ==============================================================================
# TIP: The data and mapping argument names are optional and can be dropped
p <- ggplot(mpg, aes(x = displ, y = hwy))
p
# ==============================================================================
# LESSON: Then configure the positional (x and y) scales
# Each scale gets an axis title, the range to display, and the tick positions.
p <- ggplot(mpg, aes(x = displ, y = hwy)) +
  scale_x_continuous(
    name = "Engine Size (in liters)",
    limits = c(1, 7),
    breaks = seq_len(7)
  ) +
  scale_y_continuous(
    name = "Highway Fuel Efficiency (in miles/gallon)",
    limits = c(10, 50),
    breaks = seq(10, 50, by = 10)
  )
p
# ==============================================================================
# LESSON: Finally, add a point geom
# The data, aesthetic mappings, and positional scales are the same as in the
# previous step; geom_point() adds the layer that represents the observations.
p <-
  ggplot(mpg, aes(x = displ, y = hwy)) +
  scale_x_continuous(
    name = "Engine Size (in liters)",
    limits = c(1, 7),
    breaks = 1:7
  ) +
  scale_y_continuous(
    name = "Highway Fuel Efficiency (in miles/gallon)",
    limits = c(10, 50),
    breaks = c(10, 20, 30, 40, 50)
  ) +
  geom_point()
# Print the plot object; assigning it to p alone does not display anything
p
# ==============================================================================
# TIP: If you leave off the scales, R will try to guess
p <- ggplot(mpg, aes(x = displ, y = hwy)) + geom_point()
p
# ==============================================================================
# LESSON: We can also customize the geom with arguments
# color, shape, and size are given directly to geom_point() (outside of aes()),
# so they are fixed settings rather than mappings to data variables.
p <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "red", shape = "square", size = 2)
p
# [2B] Visualize I
[2B] Visualize I
# SETUP: Load the tidyverse and preview the example dataset
library(tidyverse)
mpg
# ==============================================================================
# USECASE: Layer a smooth geom (i.e., a line of best fit) over the points
# First with the default smoothing method
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth()
# Then with the smoothing method set to "lm"
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm")
# ==============================================================================
# USECASE: Layer a line geom (i.e., connect the points)
economics
# Points only
ggplot(economics, aes(x = date, y = unemploy)) +
  geom_point()
# Points with the orange line added after them
ggplot(economics, aes(x = date, y = unemploy)) +
  geom_point() +
  geom_line(color = "orange", size = 1)
# The same two layers, with the orange line added before the points
ggplot(economics, aes(x = date, y = unemploy)) +
  geom_line(color = "orange", size = 1) +
  geom_point()
# ==============================================================================
# USECASE: Add reference line geoms
# Each plot adds an orange reference line as its first layer, followed by the
# blue data line and the points.
# NOTE(review): date is presumably a Date column, so the numeric xintercept and
# the abline intercept/slope may be read in days since 1970-01-01 -- confirm
# that the reference lines land where intended.
# NOTE(review): ggplot2 3.4.0+ prefers linewidth over size for line geoms; size
# is left unchanged here -- confirm against the installed ggplot2 version.
# Horizontal reference line
ggplot(economics, aes(x = date, y = unemploy)) +
  geom_hline(yintercept = 0, color = "orange", size = 1) +
  geom_line(color = "blue", size = 1) +
  geom_point()
# Vertical reference line
ggplot(economics, aes(x = date, y = unemploy)) +
  geom_vline(xintercept = 7.5, color = "orange", size = 1) +
  geom_line(color = "blue", size = 1) +
  geom_point()
# Diagonal reference line defined by an intercept and a slope
ggplot(economics, aes(x = date, y = unemploy)) +
  geom_abline(intercept = 4000, slope = 0.5, color = "orange", size = 1) +
  geom_line(color = "blue", size = 1) +
  geom_point()
# [2B] Visualize I
[2B] Visualize I
# SETUP: Load the tidyverse and preview the example dataset
library(tidyverse)
mpg
# ==============================================================================
# USECASE: Creating histograms
# Default binning
ggplot(mpg, aes(x = hwy)) +
  geom_histogram()
# Set the number of bins
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(bins = 20)
# Set the width of each bin instead
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(binwidth = 2)
# Set the outline color and thickness of the bars
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(
    binwidth = 2,
    color = "red",
    size = 1
  )
# Also set the fill color of the bars
ggplot(mpg, aes(x = hwy)) +
  geom_histogram(
    binwidth = 2,
    color = "red",
    size = 1,
    fill = "white"
  )
# ==============================================================================
# USECASE: Creating density plots
ggplot(mpg, aes(x = hwy)) +
  geom_density()
# Same plot with the outline and fill set
ggplot(mpg, aes(x = hwy)) +
  geom_density(
    color = "red",
    size = 1,
    fill = "white"
  )
# ==============================================================================
# USECASE: Creating box plots
# One box for hwy
ggplot(mpg, aes(x = hwy)) +
  geom_boxplot()
# hwy on x and class on y, with varwidth turned on
ggplot(mpg, aes(x = hwy, y = class)) +
  geom_boxplot(varwidth = TRUE)
# ==============================================================================
# USECASE: Creating bar plots to count categorical variables
ggplot(mpg, aes(x = class)) +
  geom_bar()
# ==============================================================================
# PITFALL: Don't try to create histograms for categorical variables
ggplot(mpg, aes(x = class)) +
  geom_histogram() # error
# [2B] Visualize I
# SETUP: Load the tidyverse and preview the example dataset
library(tidyverse)
mpg
# ==============================================================================
# USECASE: Continuous color scales work well with numeric variables
# Default continuous color scale
ggplot(mpg, aes(x = hwy, y = cty, color = displ)) +
  geom_point(size = 4)
# Continuous color scale with type set to "viridis"
ggplot(mpg, aes(x = hwy, y = cty, color = displ)) +
  geom_point(size = 4) +
  scale_color_continuous(type = "viridis")
# ==============================================================================
# USECASE: Use a discrete color scale with categorical variables
# Default discrete color scale
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point()
# Discrete color scale with a legend title and a label for each drv value
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  scale_color_discrete(
    name = "Drivetrain",
    breaks = c("4", "f", "r"),
    labels = c("Four Wheel", "Front Wheel", "Rear Wheel")
  )
# ==============================================================================
# PITFALL: Don't forget to set categorical variables as factors
ggplot(mpg, aes(x = displ, y = hwy, color = cyl)) +
  geom_point() # R guesses you want a continuous scale
# Wrapping cyl in factor() lets a discrete color scale be used
ggplot(mpg, aes(x = displ, y = hwy, color = factor(cyl))) +
  geom_point() +
  scale_color_discrete(name = "Cylinders")
# ==============================================================================
# LESSON: Set a geom's color aesthetic to make it always that color
# Here color is an argument of geom_point(), not part of aes()
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point(color = "red")
# ==============================================================================
# PITFALL: However, do this inside of geom() not aes()
# Here "blue" is placed inside aes(), so it is treated as a mapping
ggplot(mpg, aes(x = displ, y = hwy, color = "blue")) +
  geom_point() # unintended
# ==============================================================================
# LESSON: If you both set and map color, the setting will win
# color is mapped to drv in aes() and also set to "blue" in geom_point()
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point(color = "blue")
# color controls the outline color
# fill controls the internal color
# size controls the line thickness
# [2B] Visualize I
# SETUP: Load the tidyverse and preview the example dataset
library(tidyverse)
mpg
# ==============================================================================
# USECASE: Mapping the shape aesthetic to a categorical variable
ggplot(mpg, aes(x = displ, y = hwy, shape = drv)) +
  geom_point(size = 3)
# ==============================================================================
# PITFALL: Don't try to map shape to a continuous variable
ggplot(mpg, aes(x = displ, y = hwy, shape = hwy)) +
  geom_point() # error
# NOTE: This doesn't work because there are way more numbers than shapes
# ==============================================================================
# LESSON: Color vs. Fill and Size for Blocks
# Default bar appearance
ggplot(mpg, aes(y = class)) +
  geom_bar()
# Bars with the outline color, fill color, and size all set
ggplot(mpg, aes(y = class)) +
  geom_bar(
    color = "darkred",
    fill = "lightblue",
    size = 1
  )
# ==============================================================================
# LESSON: Some aesthetics cause grouping when mapped to a categorical variable
ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth(method = "lm") # single smooth
# Same layers, but with color mapped to drv
ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  geom_smooth(method = "lm") # three smooths
# ==============================================================================
# USECASE: Mapping to the fill aesthetic and setting the alpha property
# Densities filled by drv
ggplot(mpg, aes(x = hwy, fill = drv)) +
  geom_density()
# Same plot with alpha set to 0.3
ggplot(mpg, aes(x = hwy, fill = drv)) +
  geom_density(alpha = 0.3)
# ==============================================================================
# TIP: If you map the same variable to multiple aesthetics, you get redundancy
ggplot(mpg, aes(x = displ, y = hwy, shape = drv, color = drv)) +
  geom_point(size = 3) # if color fails, shape still works
# [2B] Visualize I
Data
starwars {tidyverse}
Aesthetics/Scales
height to X (continuous)
mass to Y (continuous)
Geoms
[2B] Visualize II
Data
mpg {tidyverse}
Aesthetics/Scales
displ to X (continuous)
hwy to Y (continuous)
drv to color (discrete)
Geoms
[2B] Visualize II
Data
mpg {tidyverse}
Aesthetics/Scales
hwy to X (continuous)
class to Y (discrete)
Geoms
[2B] Visualize II
Data
flights {nycflights13}
Aesthetics/Scales
origin to X (discrete)
origin to color (discrete)
count to Y (stat from geom)
Geoms
[2B] Visualize II
theme_apa()
theme() and this reference
[2B] Visualize II
# SETUP: Load the tidyverse and build an example graphic to restyle
library(tidyverse)
p <- ggplot(mpg, aes(x = displ, y = hwy, color = drv)) +
  geom_point() +
  labs(title = "Fuel Efficiency")
p
# ==============================================================================
# USECASE: Apply a "complete" theme
p + theme_bw()
p + theme_classic()
# ==============================================================================
# TIP: base_size quickly changes the font size of all elements at once
p + theme_grey(base_size = 24)
# ==============================================================================
# LESSON: The ggthemes package adds some fun complete themes
library(ggthemes)
p + theme_wsj()
p + theme_economist()
p + theme_stata()
# ==============================================================================
# LESSON: For more precise control, we can use theme()
# Move the legend
p + theme(legend.position = "top")
# Restyle the plot title text
p + theme(
  plot.title = element_text(color = "purple", face = "bold")
)
# Blank out the panel grid
p + theme(panel.grid = element_blank())
# NOTE: There are a lot of elements to learn, so use a cheatsheet!
# [2B] Visualize II
ggsave()
.png for most daily purposes
.pdf or .svg
[2B] Visualize II
# SETUP: Load the tidyverse and build an example graphic to save
library(tidyverse)
p <- ggplot(mpg, aes(x = displ, y = hwy)) +
  geom_point() +
  geom_smooth() +
  labs(x = "Engine Displacement", y = "Highway MPG")
p
# ==============================================================================
# USECASE: Save a specific ggplot object to a file
ggsave(filename = "pfinal.png", plot = p)
# ==============================================================================
# LESSON: Specify the size of the file to create
ggsave(
  filename = "pfinal2.png",
  plot = p,
  width = 6,
  height = 3,
  units = "in"
)
# ==============================================================================
# LESSON: Just change the extension to create a different file type
ggsave(
  filename = "pfinal2.pdf",
  plot = p,
  width = 6,
  height = 3,
  units = "in"
)
# ==============================================================================
# PITFALL: Creating a very large file may lead to small text
# Saves the existing plot p on a 12 x 8 inch canvas
ggsave(
  filename = "p_poster.png",
  plot = p,
  width = 12,
  height = 8,
  units = "in"
)
# ==============================================================================
# TIP: You can quickly increase the text size using base_size
# p2 is p with theme_grey() applied at base_size = 24
p2 <- p + theme_grey(base_size = 24)
ggsave(
  filename = "p_poster2.png",
  plot = p2,
  width = 12,
  height = 8,
  units = "in"
)
# [2B] Visualize II